suppressPackageStartupMessages({
  import(rpkgs)
})

import(run)
import(util)
## [1] TRUE
import(perfplot)

Parameters

modelName = "lgbm L2 regression"
trnAmt = 60 * 24 * 7 * 52
tstAmt = 60 * 24 * 7 * 2 # 2 weeks, submission period will provide new data every 2 weeks
assets = getAllAssets()
## 2021-12-05 14:26:41 INFO::Sourcing ALL_ASSETS
numSamples = 100

Data

makeData = \(env, minDate, maxDate, asset_ids, ...) {
  selectStmt = glue('
    WITH
    t1 AS (
      SELECT
        ts
      , asset_name
      , target
      , LAG(target, 15) OVER (PARTITION BY asset_id ORDER BY ts ASC)
        AS target_lag_15
      , REGR_SLOPE(vwap, EXTRACT(EPOCH FROM ts)::real)
        OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 30 PRECEDING)
        AS vwap_slope_30
      , REGR_SLOPE(vwap, EXTRACT(EPOCH FROM ts)::real)
        OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 7 PRECEDING)
        AS vwap_slope_7
      , AVG(close) OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 26 PRECEDING)
        AS sma_close_26
      , AVG(close) OVER (PARTITION BY asset_id ORDER BY ts ASC ROWS 12 PRECEDING)
        AS sma_close_12
      FROM trn
      WHERE (ts BETWEEN $1 AND $2)
        AND asset_id IN ({paste(asset_ids, collapse = ", ")})
    ),
    t2 AS (
      SELECT
        *
      , sma_close_26 - sma_close_12
        AS smacd_close
      FROM t1
    ),
    t3 AS (
      SELECT
        *
      , REGR_SLOPE(smacd_close, EXTRACT(EPOCH FROM ts)::REAL)
        OVER (PARTITION BY asset_name ORDER BY ts ASC ROWS 6 PRECEDING)
        AS smacd_close_slope_6
      FROM t2
    )
    SELECT * FROM t3
  ')

  df = getQuery(selectStmt, params = list(minDate, maxDate))

  # time features
  timeFixedArg = \(periodS) 2 * pi * as.numeric(df[,ts]) / periodS
  th_arg = timeFixedArg(60*60)
  df$time_hour_sin = sin(th_arg)
  df$time_hour_cos = cos(th_arg)
  
  td_arg = timeFixedArg(60 * 60 * 24)
  df$time_day_sin = sin(td_arg)
  df$time_day_cos = cos(td_arg)
  
  tw_arg = timeFixedArg(60 * 60 * 24 * 7)
  df$time_week_sin = sin(tw_arg)
  df$time_week_cos = cos(tw_arg)

  timeYearArg = \() {
    dates = data.table(
      y = df[,ts] |> format("%Y") |> as.numeric()
    )
    dates$yp1 = dates$y + 1
    dates$ypct = dates$y |> paste0("-01-01") |> as.POSIXct(tz = "UTC")
    dates$yp1pct = dates$yp1 |> paste0("-01-01") |> as.POSIXct(tz = "UTC")
    dates$ypos = as.numeric(df[,ts]) - as.numeric(dates$ypct)
    dates$ylen = as.numeric(dates$yp1pct) - as.numeric(dates$ypct)
    2*pi*dates$ypos / dates$ylen
  }
  
  ty_arg = timeYearArg()
  df$time_year_sin = sin(ty_arg)
  df$time_year_cos = cos(ty_arg)
  
  timeMonthArg = \() {
    ds = data.table(
      t = df[,ts],
      y = df[,ts] |> format("%Y") |> as.numeric(),
      m = df[,ts] |> format("%m") |> as.numeric()
    )
    ds$ym = paste(ds$y, sprintf("%02i", ds$m), "01", sep = "-")
    ds$ymp1 = paste(
      ifelse(ds$m == 12,
        sprintf("%04i", ds$y + 1),
        sprintf("%04i", ds$y)
      ),
      ifelse(ds$m == 12,
        "01",
        sprintf("%02i", ds$m + 1)
      ),
      "01",
      sep = "-"
    )
    ds$ympct = as.POSIXct(ds$ym, tz = "UTC")
    ds$ymp1pct = as.POSIXct(ds$ymp1, tz = "UTC")
    ds$ympos = as.numeric(df[,ts]) - as.numeric(ds$ympct)
    ds$ymlen = as.numeric(ds$ymp1pct) - as.numeric(ds$ympct)
    2 * pi * ds$ympos / ds$ymlen
  }
  
  tm_arg = timeMonthArg()
  df$time_month_sin = sin(tm_arg)
  df$time_month_cos = cos(tm_arg)
  
  # move some columns from df to 'info'
  # info columns are just used for plotting/debugging, not passed to model
  env$info = data.table(
    info_ts = df[,ts],
    info_asset_name = df[,asset_name]
  )
  
  df[,"ts"] <- NULL
  df[,"asset_name"] <- NULL

  # setup target
  env$y = df[,.(target)]
  df[,"target"] <- NULL

  env$x = df
}

Model

trainModel = \(model, trn, tst, ...) {
  # give the model a description
  model$description = 'lgbm basic test'
  
  model$nameOfStandardiserFor = \(name) glue('{name}_standardiser')
  model$getStandardiser = \(name) model[[model$nameOfStandardiserFor(name)]]

  # fit X standardisers
  lapply(colnames(trn$x), \(name) {
    model[[model$nameOfStandardiserFor(name)]] = as.standardiser(trn$x[[name]])
  })
  
  # fit y standardiser
  model$target_standardiser = as.standardiser(trn$y[,target])
  
  # define forward data transform
  model$dataFwdTransform = \(env) {
    # standardise Xs
    lapply(colnames(env$x), \(name) {
      env$x[[name]] = predict(model$getStandardiser(name), env$x[[name]])
    })

    # standardise the target
    env$y[,"target"] = predict(model$target_standardiser, env$y[,target])
  }

  # fwdTranform the data
  model$dataFwdTransform(trn)
  model$dataFwdTransform(tst) # TODO should not be the same tst

  trn = lgb.Dataset(
    as.matrix(trn$x), 
    colnames = colnames(trn$x),
    label = trn$y[,target],
    init_score = rep(0, nrow(trn$x))
  )
  
  tst = lgb.Dataset(
    as.matrix(tst$x),
    colnames = colnames(tst$x),
    label = tst$y[,target]
  )

  nrounds = 5000
  
  # train the model
  model$lgbm = lgb.train(
    data = trn,
    params = list(
      num_threads = max(parallel::detectCores(FALSE) - 1, 1),
      seed = 5280,
      num_iterations = nrounds,
      num_leaves = 31,
      max_depth = 0,
      learning_rate = 0.002,
      boosting = "gbdt",
      min_data_in_leaf = 3,
      min_sum_hessian_in_leaf = 0.01,
      lambda_l1 = 0.01,
      lambda_l2 = 0.01
    ),
    valids = list(
      trn = trn,
      tst = tst
    ),
    obj = "regression_l2",
    eval = c(
      "l2"
    ),
    record = TRUE,
    early_stopping_rounds = 100
  )
}

predictModel = \(model, tst, ...) {
  # comment out as an optimisation because we already transformed it
  # # fwdTransform the data
  # model$dataFwdTransform(tst)

  # generate prediction
  tst$yhat = predict(model$lgbm, as.matrix(tst$x))
}

Method

set.seed(93864)

for (i in 1:numSamples) {
  results = doRun(
    name = modelName,
    trnAmt = trnAmt,
    tstAmt = tstAmt,
    asset_ids = assets[6,asset_id],
    makeData = makeData,
    trainModel = trainModel,
    predictModel = predictModel
  )
}

Debug Plots

These plots are just for the last run of the model.

set.seed(575927)

pd = results$tst$x
pd = cbind(pd, results$tst$info)
pd$y = results$tst$y$target
pd$yhat = results$tst$yhat

sanityCheckingPlot = \(plotType) {
  # sample of data
  numPointsForPlot = 300
  plotStart = sample(pd[1:(nrow(pd) - numPointsForPlot),info_ts], 1)
  plotEnd = plotStart + as.difftime(numPointsForPlot, units = "mins")
  
  if (plotType == "features") {
    pd[
      info_asset_name == pd[sample(nrow(pd), 1),info_asset_name]
      & info_ts > plotStart
      & info_ts < plotEnd
    ] |>
      melt(id.vars = c("info_ts", "info_asset_name")) |>
      ggplot(aes(info_ts, value, colour = variable)) +
      geom_line() +
      facet_wrap(~ info_asset_name)
  } else if (plotType == "forecast") {
    pd[
      info_asset_name == pd[sample(nrow(pd), 1),info_asset_name]
      & info_ts > plotStart
      & info_ts < plotEnd,
      .(info_ts, info_asset_name, y, yhat)
    ] |>
      melt(id.vars = c("info_ts", "info_asset_name")) |>
      ggplot(aes(info_ts, value, colour = variable)) +
      geom_line() +
      facet_wrap(~ info_asset_name, ncol = 1)
  }
}

plotAroundBiggest = \() {
  maxy = max(pd[,y], na.rm = TRUE)
  maxts = pd[y == maxy,info_ts]
  maxasset = pd[y == maxy,info_asset_name]

  plotStart = maxts - as.difftime(100, units = "mins")
  plotEnd = maxts + as.difftime(100, units = "mins")
  pd[
    info_asset_name == maxasset
    & info_ts > plotStart
    & info_ts <= plotEnd,
    .(info_ts, y, yhat)
  ] |>
    melt(id.vars = "info_ts") |>
    ggplot(aes(x = info_ts, y = value, colour = variable)) +
    geom_line()
}

getEvaluationLog = \(lgbm) {
  iteration = 1:lgbm$current_iter()
  getResult = \(dataset, metric) {
    data.table(
      iteration = iteration,
      dataset = dataset,
      metric = metric,
      value = lgb.get.eval.result(lgbm, dataset, metric)
    )
  }
  
  eval_log = rbind(
    getResult("trn", "l2"),
    getResult("tst", "l2")
  )
  
  eval_log
}

Forecasts

Features

Biggest Y

Training Log

Correlation

The competition metric is correlation between your predictions and the targets.

Visualising this:

## Warning: Removed 5 rows containing non-finite values (stat_bin2d).

Remember, that’s just for 1 run; we repeated that experiment 100 times!

Evaluation

scores = getQuery('SELECT * FROM metrics WHERE name = $1', params = list(modelName))
scores[,.(run_id, corr, mae, aae, rmse)]
p = performancePlot(scores[,corr])
print(p)
## performancePlot:
##        count = 100
##         mean = 0.0865705867931
##           sd = 0.0894275617632283
##          sem = 0.00894275617632283
##        range = 0.55296908
##     binwidth = 0.0184323026666667
## relativePlot = <ggplot plot object>
## absolutePlot = <ggplot plot object>
##       t.test = 
##  One Sample t-test
## 
## data:  x
## t = 9.6805, df = 99, p-value = 5.446e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.06882622 0.10431496
## sample estimates:
##  mean of x 
## 0.08657059

Absolute range performance plot

Relative range performance plot

Interactive plots

Normality checks

## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.84908, p-value = 1.097e-08